# Read iPhone 12 review comments from an Excel workbook, tokenize and clean
# them, and write the cleaned tokens to a text file.
import xlrd
import matplotlib.pyplot as plt  # 数学绘图库
import jieba  # 分词库
from wordcloud import WordCloud  # 词云库
import numpy
import PIL.Image as Image  # 图像转换
import jieba.analyse
import xlwt


# Load the review spreadsheet and collect the comment text column
# (column index 1) from every data row of the first sheet.
wb = xlrd.open_workbook("Iphone12评论信息.xls")
sh = wb.sheet_by_index(0)
col = sh.ncols
row = sh.nrows
Text = []
# Start at row 1 to skip the header directly; the original appended the
# header then did `del Text[0]`, which raises IndexError on an empty sheet.
for i in range(1, row):
    # row_values(i, 1, 2) slices exactly one cell: the comment column.
    Text.append(sh.row_values(i, 1, 2)[0])

# Tokenize the review texts with jieba into a flat list of tokens.
# Join the texts instead of str(Text): str() of a list injects repr
# punctuation ("[", "'", ",") into the stream fed to the tokenizer.
cut_text = jieba.cut("".join(map(str, Text)))
result = list(cut_text)

# Text cleaning: load the stop-word list.
# Use a context manager so the file handle is closed deterministically
# (the original open(...).read() leaked the handle).
with open(r"停用词.txt", "rb") as stop_file:
    stop_content = stop_file.read()
# Segment the stop-word file the same way as before and keep each token.
stop = list(jieba.cut(stop_content))
# print(stop)

# Token cleaning: drop stop words, tokens containing any non-Chinese
# character, and single-character tokens.
stop_words = set(stop)  # O(1) membership vs O(n) list scan per token
clean_data = []
for item in result:
    if item in stop_words or len(item) <= 1:
        continue
    # Keep only tokens made entirely of CJK unified ideographs
    # (U+4E00 .. U+9FFF); all() short-circuits on the first miss.
    if all('\u4e00' <= ch <= '\u9fff' for ch in item):
        clean_data.append(item)

# Persist the cleaned tokens, space-separated, for later use (e.g. a
# word cloud). Specify UTF-8 explicitly: the platform-default codec
# (cp936/cp1252 on some Windows setups) may fail to encode the Chinese
# tokens and raise UnicodeEncodeError.
with open("数据清洗数据.txt", "w", encoding="utf-8") as f:
    # One buffered call instead of many tiny writes; output bytes are
    # identical to writing `item + " "` per token.
    f.writelines(item + " " for item in clean_data)
